## Load the Wisconsin nursing home survey data and drop incomplete records
datafile <- read.csv("../WiscNursingHome.csv")
datafile <- na.omit(datafile)
## Encode the categorical fields as factors
datafile$ORGSTR <- factor(datafile$ORGSTR)
datafile$MSA <- factor(datafile$MSA)
## Occupancy measure: total patient-years per licensed bed
datafile$UTILIZATION_RATE <- datafile$TPY / datafile$NUMBED
# The records are stored in the dataframe `datafile` (loaded above)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Count how many survey rows each facility (hospID) contributes
hospID_counts <- datafile %>%
  group_by(hospID) %>%
  summarise(count = n())
# Facilities observed in more than one survey year
to_keep <- hospID_counts %>%
  filter(count > 1) %>%
  pull(hospID)
# Restrict the data to those repeatedly-observed facilities
datafile_filtered <- datafile %>%
  filter(hospID %in% to_keep)
library("dplyr")
# One row per facility with the 2000 and 2001 survey values side by side.
# Use the filtered data so every hospID has a row for each year; with the
# unfiltered data the single-year extractions below would be length zero
# for facilities observed only once.
# (The duplicated NUMBED_2000/NUMBED_2001 pair in the original has been removed.)
years_comparison <- datafile_filtered %>%
  group_by(hospID) %>%
  summarise(SQRFOOT_2000 = SQRFOOT[CRYEAR == 2000], SQRFOOT_2001 = SQRFOOT[CRYEAR == 2001],
            NUMBED_2000 = NUMBED[CRYEAR == 2000], NUMBED_2001 = NUMBED[CRYEAR == 2001],
            TPY_2000 = TPY[CRYEAR == 2000], TPY_2001 = TPY[CRYEAR == 2001],
            MSA_2000 = MSA[CRYEAR == 2000], MSA_2001 = MSA[CRYEAR == 2001],
            URBAN_2000 = URBAN[CRYEAR == 2000], URBAN_2001 = URBAN[CRYEAR == 2001],
            TAXEXEMPT_2000 = TAXEXEMPT[CRYEAR == 2000], TAXEXEMPT_2001 = TAXEXEMPT[CRYEAR == 2001],
            PRO_2000 = PRO[CRYEAR == 2000], PRO_2001 = PRO[CRYEAR == 2001],
            MCERT_2000 = MCERT[CRYEAR == 2000], MCERT_2001 = MCERT[CRYEAR == 2001],
            SELFFUNDINS_2000 = SELFFUNDINS[CRYEAR == 2000], SELFFUNDINS_2001 = SELFFUNDINS[CRYEAR == 2001],
            ORGSTR_2000 = ORGSTR[CRYEAR == 2000], ORGSTR_2001 = ORGSTR[CRYEAR == 2001],
            UTILIZATION_RATE_2000 = UTILIZATION_RATE[CRYEAR == 2000],
            UTILIZATION_RATE_2001 = UTILIZATION_RATE[CRYEAR == 2001])
## `summarise()` has grouped output by 'hospID'. You can override using the
## `.groups` argument.
# Display the per-facility year-by-year comparison table
years_comparison
## # A tibble: 346 × 23
## # Groups: hospID [346]
## hospID SQRFOOT_2000 SQRFOOT…¹ NUMBE…² NUMBE…³ TPY_2…⁴ TPY_2…⁵ MSA_2…⁶ MSA_2…⁷
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct>
## 1 101 10.9 10.9 18 18 16.5 16.7 0 0
## 2 103 19.8 19.8 63 50 59.2 45.6 0 0
## 3 105 26.9 26.9 54 54 49.6 52.4 1 1
## 4 107 26.3 26.3 60 60 51.9 57.4 0 0
## 5 108 30.7 30.7 104 104 94.6 95.6 10 10
## 6 109 24.3 24.3 79 79 69.7 70.2 11 11
## 7 110 34.4 34.4 105 105 95.6 97.7 11 11
## 8 111 52.1 46.0 129 108 95.2 98.4 6 6
## 9 112 71.8 71.8 200 200 193. 183. 10 10
## 10 113 63.4 63.4 184 184 156. 153. 6 6
## # … with 336 more rows, 14 more variables: URBAN_2000 <int>, URBAN_2001 <int>,
## # TAXEXEMPT_2000 <int>, TAXEXEMPT_2001 <int>, PRO_2000 <int>, PRO_2001 <int>,
## # MCERT_2000 <int>, MCERT_2001 <int>, SELFFUNDINS_2000 <int>,
## # SELFFUNDINS_2001 <int>, ORGSTR_2000 <fct>, ORGSTR_2001 <fct>,
## # UTILIZATION_RATE_2000 <dbl>, UTILIZATION_RATE_2001 <dbl>, and abbreviated
## # variable names ¹SQRFOOT_2001, ²NUMBED_2000, ³NUMBED_2001, ⁴TPY_2000,
## # ⁵TPY_2001, ⁶MSA_2000, ⁷MSA_2001
# Split the filtered records by survey year
data_2000 <- datafile_filtered[datafile_filtered$CRYEAR == 2000, ]
data_2001 <- datafile_filtered[datafile_filtered$CRYEAR == 2001, ]
# Attach each facility's year-2000 utilization rate to its 2001 record.
# Align explicitly by hospID instead of assuming the two subsets happen
# to be in identical row order.
data_2001$PREV_UTILIZATION_RATE <-
  data_2000$UTILIZATION_RATE[match(data_2001$hospID, data_2000$hospID)]
# Plotting libraries: ggplot2 for graphics, GGally for ggpairs()
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
For the moment we just work on the data coming from the 2001 survey (with the 2000 utilization rate attached as a predictor), to avoid dependency between observations.
First we consider the quantitative variables only.
Let’s see the correlation:
# Pairwise scatter plots and correlations of the quantitative variables
ggpairs(subset(data_2001, select = c(SQRFOOT, NUMBED, TPY, PREV_UTILIZATION_RATE)))
# Correlation between log patient-years and last year's utilization rate
cor(log(data_2001$TPY), data_2001$PREV_UTILIZATION_RATE)
## [1] 0.2124294
We can see the strong correlation between all three variables. In particular we see the almost perfectly linear correlation between TPY and NUMBED.
Being NUMBED the variable most linearly correlated with TPY, we start modelling TPY using NUMBED and subsequently add SQRFOOT and the interaction between the two:
# Linear model for TPY from last year's utilization rate and floor area
summary(lm(TPY ~ PREV_UTILIZATION_RATE + SQRFOOT, data = data_2001))
##
## Call:
## lm(formula = TPY ~ PREV_UTILIZATION_RATE + SQRFOOT, data = data_2001)
##
## Residuals:
## Min 1Q Median 3Q Max
## -101.081 -15.096 -3.165 15.082 90.070
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.1600 14.7886 0.078 0.938
## PREV_UTILIZATION_RATE 38.2421 16.2273 2.357 0.019 *
## SQRFOOT 1.0621 0.0419 25.348 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26.37 on 343 degrees of freedom
## Multiple R-squared: 0.6605, Adjusted R-squared: 0.6585
## F-statistic: 333.6 on 2 and 343 DF, p-value: < 2.2e-16
The individual t tests do not tell us how much each predictor adds once the others are in the model. To understand better the role of the variables we can perform the analysis of variance:
# Sequential analysis of deviance: NUMBED, SQRFOOT and their interaction
# added one at a time (a Gaussian GLM with identity link is the linear model)
anova(glm(TPY ~ NUMBED + SQRFOOT + NUMBED:SQRFOOT, data = data_2001, family = gaussian),
test = "Chisq")
## Analysis of Deviance Table
##
## Model: gaussian, link: identity
##
## Response: TPY
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 345 702732
## NUMBED 1 683048 344 19684 < 2.2e-16 ***
## SQRFOOT 1 941 343 18743 3.398e-05 ***
## NUMBED:SQRFOOT 1 4 342 18739 0.7785
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
As we can see, despite being NUMBED and SQRFOOT highly correlated, it is worth including the second variable to the model too. Adding the interaction, instead, seems not to give any important information to the model.
We can also use the Akaike information criterion to check this result (here we compare some of the possible combinations):
# AIC comparison (lower is better) of the candidate models built from
# NUMBED, SQRFOOT and their interaction
AIC(lm(TPY ~ NUMBED, data = data_2001),
lm(TPY ~ SQRFOOT, data = data_2001),
lm(TPY ~ NUMBED + SQRFOOT, data = data_2001),
lm(TPY ~ NUMBED:SQRFOOT, data = data_2001),
lm(TPY ~ NUMBED + SQRFOOT + NUMBED:SQRFOOT, data = data_2001),
lm(TPY ~ NUMBED + NUMBED:SQRFOOT, data = data_2001),
lm(TPY ~ SQRFOOT + NUMBED:SQRFOOT, data = data_2001))
## df AIC
## lm(TPY ~ NUMBED, data = data_2001) 3 2386.140
## lm(TPY ~ SQRFOOT, data = data_2001) 3 3254.947
## lm(TPY ~ NUMBED + SQRFOOT, data = data_2001) 4 2371.185
## lm(TPY ~ NUMBED:SQRFOOT, data = data_2001) 3 3149.517
## lm(TPY ~ NUMBED + SQRFOOT + NUMBED:SQRFOOT, data = data_2001) 5 2373.105
## lm(TPY ~ NUMBED + NUMBED:SQRFOOT, data = data_2001) 4 2379.298
## lm(TPY ~ SQRFOOT + NUMBED:SQRFOOT, data = data_2001) 4 3150.156
Again the analysis indicates NUMBED as the most relevant variable and the interaction between NUMBED and SQRFOOT as basically non relevant.
We can try to improve the model by log-transforming the predictors:
# AIC comparison with log-transformed predictors (response untransformed)
AIC(lm(TPY ~ log(NUMBED), data = data_2001),
lm(TPY ~ log(SQRFOOT), data = data_2001),
lm(TPY ~ log(NUMBED) + log(SQRFOOT), data = data_2001),
lm(TPY ~ log(NUMBED) + SQRFOOT, data = data_2001),
lm(TPY ~ NUMBED + log(SQRFOOT), data = data_2001))
## df AIC
## lm(TPY ~ log(NUMBED), data = data_2001) 3 2931.458
## lm(TPY ~ log(SQRFOOT), data = data_2001) 3 3283.465
## lm(TPY ~ log(NUMBED) + log(SQRFOOT), data = data_2001) 4 2926.603
## lm(TPY ~ log(NUMBED) + SQRFOOT, data = data_2001) 4 2825.379
## lm(TPY ~ NUMBED + log(SQRFOOT), data = data_2001) 4 2378.719
We cannot see any improvement.
At the moment, the best model (according to AIC) seems to be the one with just the single linear contributions of the two variables. Let’s analyze the residuals:
# Best model by AIC so far: additive effects of NUMBED and SQRFOOT
fit.linear <- lm(TPY ~ NUMBED + SQRFOOT, data = data_2001)
summary(fit.linear)
##
## Call:
## lm(formula = TPY ~ NUMBED + SQRFOOT, data = data_2001)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63.816 -2.385 0.837 3.514 32.744
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.20766 0.89904 -0.231 0.817
## NUMBED 0.88207 0.01379 63.985 < 2e-16 ***
## SQRFOOT 0.08055 0.01941 4.151 4.19e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.392 on 343 degrees of freedom
## Multiple R-squared: 0.9733, Adjusted R-squared: 0.9732
## F-statistic: 6259 on 2 and 343 DF, p-value: < 2.2e-16
# Standard diagnostics: residuals vs fitted, QQ, scale-location, leverage
par(mfrow = c(2, 2))
plot(fit.linear)
The residuals are not exactly as we would expect from a good linear model. In particular there seem to be a couple of outliers, the residuals are not normally distributed on the edges and homoscedasticity is not satisfied.
Let’s also try to inspect the residuals’ plots for the other good models (according to AIC) we got previously:
# Residual diagnostics for the other low-AIC candidate models
par(mfrow = c(2, 2))
plot(lm(TPY ~ NUMBED, data = data_2001))
par(mfrow = c(2, 2))
plot(lm(TPY ~ NUMBED + SQRFOOT + NUMBED:SQRFOOT, data = data_2001))
par(mfrow = c(2, 2))
plot(lm(TPY ~ NUMBED + NUMBED:SQRFOOT, data = data_2001))
par(mfrow = c(2, 2))
plot(lm(TPY ~ NUMBED + log(SQRFOOT), data = data_2001))
None of the models above seems to satisfy the assumptions on the linear model.
Let’s now look at the distribution of the data:
# Histograms of the response and the two numeric predictors
# (2x2 layout; the fourth panel is left empty)
par(mfrow = c(2, 2))
hist(data_2001$TPY, xlab = "TPY", freq = TRUE)
hist(data_2001$NUMBED, xlab = "NUMBED", freq = TRUE)
hist(data_2001$SQRFOOT, xlab = "SQRFOOT", freq = TRUE)
As we can see all of the variables are strongly skewed.
First let’s see the correlation:
## TODO: run ggpairs() on the log-transformed variables
We see that also TPY is strongly skewed, therefore we can try to model its log-transform:
# Log-log model with interaction for the skewed variables
summary(lm(log(TPY) ~ log(NUMBED) + log(SQRFOOT) + log(NUMBED):log(SQRFOOT), data = data_2001))
##
## Call:
## lm(formula = log(TPY) ~ log(NUMBED) + log(SQRFOOT) + log(NUMBED):log(SQRFOOT),
## data = data_2001)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.87833 -0.02192 0.01586 0.05139 0.27106
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.170946 0.204398 -0.836 0.404
## log(NUMBED) 0.996138 0.048397 20.583 <2e-16 ***
## log(SQRFOOT) 0.032319 0.057250 0.565 0.573
## log(NUMBED):log(SQRFOOT) -0.001251 0.012330 -0.101 0.919
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09381 on 342 degrees of freedom
## Multiple R-squared: 0.9651, Adjusted R-squared: 0.9648
## F-statistic: 3155 on 3 and 342 DF, p-value: < 2.2e-16
# Sequential analysis of deviance for the log-log model with interaction
anova(glm(log(TPY) ~ log(NUMBED) + log(SQRFOOT) + log(NUMBED):log(SQRFOOT), data = data_2001,
family = gaussian), test = "Chisq")
## Analysis of Deviance Table
##
## Model: gaussian, link: identity
##
## Response: log(TPY)
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 345 86.308
## log(NUMBED) 1 83.268 344 3.040 < 2e-16 ***
## log(SQRFOOT) 1 0.030 343 3.010 0.06415 .
## log(NUMBED):log(SQRFOOT) 1 0.000 342 3.010 0.91920
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
In this case even the main effect of SQRFOOT seems to be non relevant. Let’s give a look at the plot of the model with just log(NUMBED) and with just log(SQRFOOT):
# Log-log model with log(NUMBED) as the only predictor
summary(lm(log(TPY) ~ log(NUMBED), data = data_2001))
##
## Call:
## lm(formula = log(TPY) ~ log(NUMBED), data = data_2001)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.87488 -0.02220 0.01507 0.05291 0.28856
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.17468 0.04706 -3.711 0.00024 ***
## log(NUMBED) 1.01924 0.01050 97.071 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.094 on 344 degrees of freedom
## Multiple R-squared: 0.9648, Adjusted R-squared: 0.9647
## F-statistic: 9423 on 1 and 344 DF, p-value: < 2.2e-16
# Log-log model with log(SQRFOOT) as the only predictor
summary(lm(log(TPY) ~ log(SQRFOOT), data = data_2001))
##
## Call:
## lm(formula = log(TPY) ~ log(SQRFOOT), data = data_2001)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.67815 -0.16207 0.02277 0.18601 1.05067
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.88032 0.09667 19.45 <2e-16 ***
## log(SQRFOOT) 0.66841 0.02564 26.07 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2904 on 344 degrees of freedom
## Multiple R-squared: 0.6639, Adjusted R-squared: 0.663
## F-statistic: 679.6 on 1 and 344 DF, p-value: < 2.2e-16
And to their residuals:
# Residual diagnostics for the two single-predictor log-log models fitted
# above. The original plotted a three-predictor model here (including
# log(PREV_UTILIZATION_RATE)), which did not match the text; plot the
# log(NUMBED)-only model instead.
par(mfrow = c(2, 2))
plot(lm(log(TPY) ~ log(NUMBED), data = data_2001))
par(mfrow = c(2, 2))
plot(lm(log(TPY) ~ log(SQRFOOT), data = data_2001))
We look at the AIC too:
# AIC comparison of the log-log candidate models
AIC(lm(log(TPY) ~ log(NUMBED), data = data_2001),
lm(log(TPY) ~ log(SQRFOOT), data = data_2001),
lm(log(TPY) ~ log(NUMBED) + log(SQRFOOT), data = data_2001))
## df AIC
## lm(log(TPY) ~ log(NUMBED), data = data_2001) 3 -650.2733
## lm(log(TPY) ~ log(SQRFOOT), data = data_2001) 3 130.1727
## lm(log(TPY) ~ log(NUMBED) + log(SQRFOOT), data = data_2001) 4 -651.7229
Again NUMBED seems to be a good predictor, but the residuals suggest that the assumption on the linear model are not satisfied.
On the other hand SQRFOOT seems to be less relevant but with much better residuals (apart from the high leverage outlier 331).
We can also try a kind of mixed model:
# Mixed transform: raw NUMBED together with log(SQRFOOT)
summary(lm(log(TPY) ~ NUMBED + log(SQRFOOT), data = data_2001))
##
## Call:
## lm(formula = log(TPY) ~ NUMBED + log(SQRFOOT), data = data_2001)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.20596 -0.06471 0.04114 0.10897 0.37224
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.8948843 0.0747232 38.741 < 2e-16 ***
## NUMBED 0.0076163 0.0003292 23.135 < 2e-16 ***
## log(SQRFOOT) 0.1982214 0.0258945 7.655 1.98e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1817 on 343 degrees of freedom
## Multiple R-squared: 0.8688, Adjusted R-squared: 0.868
## F-statistic: 1135 on 2 and 343 DF, p-value: < 2.2e-16
# Diagnostics and AIC for the mixed-transform model
par(mfrow = c(2, 2))
plot(lm(log(TPY) ~ NUMBED + log(SQRFOOT), data = data_2001))
AIC(lm(log(TPY) ~ NUMBED + log(SQRFOOT), data = data_2001))
## [1] -193.1341
But the results are not encouraging.
To fix residuals we can also try:
# Reciprocal transform of the response
summary(lm(I(1/TPY) ~ NUMBED + SQRFOOT, data = data_2001))
##
## Call:
## lm(formula = I(1/TPY) ~ NUMBED + SQRFOOT, data = data_2001)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.005553 -0.002945 -0.001930 0.000552 0.056076
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.708e-02 7.253e-04 37.338 <2e-16 ***
## NUMBED -1.403e-04 1.112e-05 -12.616 <2e-16 ***
## SQRFOOT 1.858e-05 1.566e-05 1.187 0.236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.005964 on 343 degrees of freedom
## Multiple R-squared: 0.5237, Adjusted R-squared: 0.5209
## F-statistic: 188.6 on 2 and 343 DF, p-value: < 2.2e-16
# Square-root transform of the response
summary(lm(I(sqrt(TPY)) ~ NUMBED + SQRFOOT, data = data_2001))
##
## Call:
## lm(formula = I(sqrt(TPY)) ~ NUMBED + SQRFOOT, data = data_2001)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6447 -0.2063 0.1194 0.3320 1.8444
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.746642 0.068224 69.58 <2e-16 ***
## NUMBED 0.044967 0.001046 42.98 <2e-16 ***
## SQRFOOT 0.001384 0.001473 0.94 0.348
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.561 on 343 degrees of freedom
## Multiple R-squared: 0.939, Adjusted R-squared: 0.9386
## F-statistic: 2640 on 2 and 343 DF, p-value: < 2.2e-16
# Diagnostics for the reciprocal- and square-root-transformed models
par(mfrow = c(2, 2))
plot(lm(I(1/TPY) ~ NUMBED + SQRFOOT, data = data_2001))
par(mfrow = c(2, 2))
plot(lm(I(sqrt(TPY)) ~ NUMBED + SQRFOOT, data = data_2001))
The results are even worse than before.
By using the numerical variables only we were not able to obtain a good model. In particular it was not possible to build a model in which all the characteristic assumptions for linear models were met. We can then try to improve our model by adding the categorical predictors.
First of all we can observe two things about the categorical variables:
MSA specifies the information given by URBAN (therefore they will be alternative variables in the model);
Information expressed by PRO and TAXEXEMPT is implicit in ORGSTR.
We can also visualize the previous two observations by graphs:
### TODO: add the plots prepared by Simone
For the moment let’s exclude the variable MCERT and use the most “generic” variables. As a baseline we take the linear model with the lowest AIC score.
# Baseline model extended with the categorical predictors (MCERT excluded)
summary(lm(TPY ~ NUMBED + SQRFOOT + TAXEXEMPT + SELFFUNDINS + URBAN + PRO, data = data_2001))
##
## Call:
## lm(formula = TPY ~ NUMBED + SQRFOOT + TAXEXEMPT + SELFFUNDINS +
## URBAN + PRO, data = data_2001)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63.033 -2.389 0.887 3.678 32.453
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.30487 1.65318 -0.184 0.85380
## NUMBED 0.88749 0.01434 61.888 < 2e-16 ***
## SQRFOOT 0.07476 0.02110 3.544 0.00045 ***
## TAXEXEMPT 0.87293 1.40626 0.621 0.53519
## SELFFUNDINS 0.39001 0.83297 0.468 0.63993
## URBAN -1.16575 0.83217 -1.401 0.16217
## PRO -0.17355 1.44099 -0.120 0.90421
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.393 on 339 degrees of freedom
## Multiple R-squared: 0.9736, Adjusted R-squared: 0.9732
## F-statistic: 2086 on 6 and 339 DF, p-value: < 2.2e-16
# Sequential analysis of deviance including the categorical predictors
anova(glm(TPY ~ NUMBED + SQRFOOT + TAXEXEMPT + SELFFUNDINS + URBAN + PRO, data = data_2001,
family = gaussian), test = "Chisq")
## Analysis of Deviance Table
##
## Model: gaussian, link: identity
##
## Response: TPY
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 345 702732
## NUMBED 1 683048 344 19684 < 2.2e-16 ***
## SQRFOOT 1 941 343 18743 3.327e-05 ***
## TAXEXEMPT 1 86 342 18657 0.2105
## SELFFUNDINS 1 14 341 18643 0.6145
## URBAN 1 112 340 18531 0.1522
## PRO 1 1 339 18531 0.9041
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Residual diagnostics for the model with categorical predictors
par(mfrow = c(2, 2))
plot(lm(TPY ~ NUMBED + SQRFOOT + TAXEXEMPT + SELFFUNDINS + URBAN + PRO,
data = data_2001))
Let’s now try with the log-transformed variables:
# Log-log model with the categorical predictors added
summary(lm(log(TPY) ~ log(NUMBED) + log(SQRFOOT) + TAXEXEMPT + SELFFUNDINS + URBAN + PRO,
data = data_2001))
##
## Call:
## lm(formula = log(TPY) ~ log(NUMBED) + log(SQRFOOT) + TAXEXEMPT +
## SELFFUNDINS + URBAN + PRO, data = data_2001)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.86398 -0.02304 0.01933 0.04979 0.27216
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.170131 0.053410 -3.185 0.00158 **
## log(NUMBED) 1.002178 0.018924 52.957 < 2e-16 ***
## log(SQRFOOT) 0.019028 0.015588 1.221 0.22307
## TAXEXEMPT 0.019875 0.017663 1.125 0.26127
## SELFFUNDINS 0.003665 0.010536 0.348 0.72815
## URBAN -0.015259 0.010433 -1.463 0.14452
## PRO -0.001832 0.017927 -0.102 0.91868
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09333 on 339 degrees of freedom
## Multiple R-squared: 0.9658, Adjusted R-squared: 0.9652
## F-statistic: 1595 on 6 and 339 DF, p-value: < 2.2e-16
# Sequential analysis of deviance for the log-log model with categoricals
anova(glm(log(TPY) ~ log(NUMBED) + log(SQRFOOT) + TAXEXEMPT + SELFFUNDINS + URBAN + PRO,
data = data_2001, family = gaussian), test = "Chisq")
## Analysis of Deviance Table
##
## Model: gaussian, link: identity
##
## Response: log(TPY)
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 345 86.308
## log(NUMBED) 1 83.268 344 3.040 < 2e-16 ***
## log(SQRFOOT) 1 0.030 343 3.010 0.06278 .
## TAXEXEMPT 1 0.036 342 2.973 0.04070 *
## SELFFUNDINS 1 0.001 341 2.972 0.69959
## URBAN 1 0.019 340 2.953 0.13655
## PRO 1 0.000 339 2.953 0.91862
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Residual diagnostics for the log-log model with categoricals
par(mfrow = c(2, 2))
plot(lm(log(TPY) ~ log(NUMBED) + log(SQRFOOT) + TAXEXEMPT + SELFFUNDINS + URBAN + PRO,
data = data_2001))
Better, but still the outliers are a problem.
Notice that TAXEXEMPT seems to be the only relevant categorical variable. Let’s try to put it last in the sequential ANOVA to check:
# Same sequential ANOVA with TAXEXEMPT entered last, so its deviance
# contribution is assessed after all the other predictors
anova(glm(log(TPY) ~ log(NUMBED) + log(SQRFOOT) + SELFFUNDINS + URBAN + PRO + TAXEXEMPT,
data = data_2001, family = gaussian), test = "Chisq")
## Analysis of Deviance Table
##
## Model: gaussian, link: identity
##
## Response: log(TPY)
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 345 86.308
## log(NUMBED) 1 83.268 344 3.040 < 2e-16 ***
## log(SQRFOOT) 1 0.030 343 3.010 0.06278 .
## SELFFUNDINS 1 0.001 342 3.009 0.72424
## URBAN 1 0.023 341 2.986 0.10757
## PRO 1 0.023 340 2.964 0.10796
## TAXEXEMPT 1 0.011 339 2.953 0.26047
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
When TAXEXEMPT is entered last its effect is no longer significant either: none of the categorical variables appears clearly relevant once the size variables are already in the model.
In the end the only acceptable model (that is: the only model whose residuals’ plots confirm that the linear models’ hypotheses are met) seems to be the following.
# Final model: log floor area, tax-exempt status and previous-year
# utilization rate (NUMBED deliberately omitted)
par(mfrow=c(2,2))
fit.best <- lm(log(TPY) ~ log(SQRFOOT) + TAXEXEMPT + PREV_UTILIZATION_RATE, data = data_2001)
summary(fit.best)
##
## Call:
## lm(formula = log(TPY) ~ log(SQRFOOT) + TAXEXEMPT + PREV_UTILIZATION_RATE,
## data = data_2001)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.34987 -0.14881 0.00171 0.17431 1.01927
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.10304 0.17397 6.340 7.25e-10 ***
## log(SQRFOOT) 0.67531 0.02473 27.306 < 2e-16 ***
## TAXEXEMPT -0.14240 0.03144 -4.529 8.21e-06 ***
## PREV_UTILIZATION_RATE 0.88314 0.17138 5.153 4.34e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2753 on 342 degrees of freedom
## Multiple R-squared: 0.6997, Adjusted R-squared: 0.6971
## F-statistic: 265.7 on 3 and 342 DF, p-value: < 2.2e-16
# Diagnostic plots (the 2x2 layout was set before the fit above)
plot(fit.best)
# Sequential ANOVA; for an lm fit the printed table uses F tests, so the
# `test = "Chisq"` request has no effect (the output below shows F values)
anova(fit.best, test = "Chisq")
## Analysis of Variance Table
##
## Response: log(TPY)
## Df Sum Sq Mean Sq F value Pr(>F)
## log(SQRFOOT) 1 57.304 57.304 756.266 < 2.2e-16 ***
## TAXEXEMPT 1 1.078 1.078 14.229 0.0001907 ***
## PREV_UTILIZATION_RATE 1 2.012 2.012 26.554 4.34e-07 ***
## Residuals 342 25.914 0.076
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Diagnostic plots for the final model, repeated with an explicit layout
par(mfrow = c(2, 2))
plot(fit.best)
Here we try to use 2000’s UTILIZATION_RATE to filter out outliers from 2001’s data.
First we plot the model for all 2001 data:
# Reference model on all 2001 data, before removing outliers
summary(lm(log(TPY) ~ log(NUMBED) + TAXEXEMPT, data = data_2001))
##
## Call:
## lm(formula = log(TPY) ~ log(NUMBED) + TAXEXEMPT, data = data_2001)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.86517 -0.02326 0.01758 0.04951 0.29827
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.18386 0.04685 -3.924 0.000105 ***
## log(NUMBED) 1.01913 0.01042 97.799 < 2e-16 ***
## TAXEXEMPT 0.02594 0.01037 2.501 0.012837 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09329 on 343 degrees of freedom
## Multiple R-squared: 0.9654, Adjusted R-squared: 0.9652
## F-statistic: 4787 on 2 and 343 DF, p-value: < 2.2e-16
# Diagnostics for the reference model on the full 2001 data
par(mfrow = c(2, 2))
plot(lm(log(TPY) ~ log(NUMBED) + TAXEXEMPT, data = data_2001))
# Keep only facilities whose 2000 utilization rate lies in (0.85, 1.07)
# NOTE(review): the bounds are hard-coded — presumably the interval found
# in the 2000 analysis; confirm against that analysis
data_2001_trunc <- data_2001[data_2001$PREV_UTILIZATION_RATE > 0.85 &
data_2001$PREV_UTILIZATION_RATE < 1.07, ]
summary(lm(log(TPY) ~ log(NUMBED) + TAXEXEMPT, data = data_2001_trunc))
##
## Call:
## lm(formula = log(TPY) ~ log(NUMBED) + TAXEXEMPT, data = data_2001_trunc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.218415 -0.024751 0.007205 0.033057 0.282089
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.087348 0.029536 -2.957 0.003352 **
## log(NUMBED) 1.002350 0.006562 152.750 < 2e-16 ***
## TAXEXEMPT 0.021116 0.006317 3.343 0.000936 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05327 on 297 degrees of freedom
## Multiple R-squared: 0.9875, Adjusted R-squared: 0.9874
## F-statistic: 1.169e+04 on 2 and 297 DF, p-value: < 2.2e-16
# Diagnostics for the model refitted on the truncated data
par(mfrow = c(2, 2))
plot(lm(log(TPY) ~ log(NUMBED) + TAXEXEMPT, data = data_2001_trunc))
We will use the interval found for 2000 to improve the model: